For the selected models, prepare a knitr/jupyter notebook based on the following points (you can use models created in Homework 1). Submit your results on GitHub to the directory Homeworks/HW2.
Calculate Shapley values for player A given the following value function
# Shapley value of player A: average marginal contribution over all 6 orderings.
# Orderings A,B,C and A,C,B: A enters first, adds v(A) = 20          (x2)
# Ordering  B,A,C:           A adds v(A,B)   - v(B)   = 60 - 20 = 40
# Ordering  C,A,B:           A adds v(A,C)   - v(C)   = 70 - 60 = 10
# Orderings B,C,A and C,B,A: A adds v(A,B,C) - v(B,C) = 100 - 70 = 30 (x2)
A = (2 * 20 + 40 + 10 + 2 * 30) / 6
print(A)
# Shapley value of player B (symmetric to A under this value function).
# Orderings B,A,C and B,C,A: B enters first, adds v(B) = 20          (x2)
# Ordering  A,B,C:           B adds v(A,B)   - v(A)   = 60 - 20 = 40
# Ordering  C,B,A:           B adds v(B,C)   - v(C)   = 70 - 60 = 10
# Orderings A,C,B and C,A,B: B adds v(A,B,C) - v(A,C) = 100 - 70 = 30 (x2)
B = (2 * 20 + 40 + 10 + 2 * 30) / 6
print(B)
# Shapley value of player C.
# Orderings C,A,B and C,B,A: C enters first, adds v(C) = 60          (x2)
# Ordering  A,C,B:           C adds v(A,C)   - v(A)   = 70 - 20 = 50
# Ordering  B,C,A:           C adds v(B,C)   - v(B)   = 70 - 20 = 50
# Orderings A,B,C and B,A,C: C adds v(A,B,C) - v(A,B) = 100 - 60 = 40 (x2)
# Sanity: A + B + C = 25 + 25 + 50 = 100 = v(A,B,C).
C = (2 * 60 + 50 + 50 + 2 * 40) / 6
print(C)
# Silence pandas/xgboost FutureWarnings so they don't clutter the notebook output.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import dalex as dx
import xgboost as xgb
import shap
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import plotly.io as pio
# Render dalex/plotly figures inline inside the notebook.
pio.renderers.default = "notebook"
import platform
print(f'Python {platform.python_version()}')
# Notebook-cell expression: displays the versions of the key packages for reproducibility.
{package.__name__: package.__version__ for package in [dx, xgb, shap, sklearn, pd, np]}
# Hard-coded local path to the brain-stroke dataset; adjust when running elsewhere.
path = r'C:\Users\Mateusz\Documents\Studia_nauka\ML_MIM\xai_mim\brain_stroke.csv'
df = pd.read_csv(path)
# Data preview
df.tail()
df.describe().round(2)
# Null check: missing-value count per column.
# BUG FIX: the original used `df.isnull().all()`, which asks "is EVERY value
# in the column null?" and is False even when some values are missing;
# `.sum()` reports the actual number of nulls per column.
df.isnull().sum()
# Data type check
df.info()
# Convert all object (string) columns to pandas categoricals so XGBoost's
# native categorical support (`enable_categorical=True`) can consume them.
df.loc[:, df.dtypes == 'object'] =\
df.select_dtypes(['object'])\
.apply(lambda x: x.astype('category'))
df.info()
# Split the frame into features X and the binary target y.
target_col = 'stroke'
# One-hot encode the binary categoricals only, because the `max_cat_to_onehot`
# parameter in XGBoost is yet to be working properly.
X = pd.get_dummies(
    df.drop(columns=target_col),
    columns=['gender', 'ever_married', 'Residence_type'],
    drop_first=True,
)
y = df[target_col]
# Gradient-boosted classifier trained directly on the partly-categorical frame.
# `tree_method="hist"` is required for `enable_categorical=True`.
booster_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "use_label_encoder": False,
    "eval_metric": "logloss",
    "enable_categorical": True,
    "tree_method": "hist",
    "seed": 77,
}
model_categorical = xgb.XGBClassifier(**booster_params)
model_categorical.fit(X, y)
# Sanity check: class probabilities for the first two rows.
model_categorical.predict_proba(X.iloc[0:2])
Some Explanations
model = model_categorical
def pf_xgboost_classifier_categorical(model, df):
    """Predict-function for dalex: cast object columns to category, then score.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``.
    df : pd.DataFrame of features; object-dtype columns are converted to
        pandas categoricals before prediction.

    Returns
    -------
    1-D array of positive-class probabilities, one per row of ``df``.
    """
    # BUG FIX: work on a copy — the original assigned back into the caller's
    # frame, silently mutating the DataFrame dalex passes in on every call.
    df = df.copy()
    obj_cols = df.select_dtypes(include=['object']).columns
    df[obj_cols] = df[obj_cols].astype('category')
    return model.predict_proba(df)[:, 1]
# dalex explainer wrapping the trained model with the custom predict function.
explainer = dx.Explainer(model, X, y, predict_function=pf_xgboost_classifier_categorical, label="GBM")
# Overall model-performance summary computed on the training data.
explainer.model_performance()
Select two observations from the dataset and calculate the model's prediction. I have chosen observations 0 and 1.
# Show the two selected observations and the model's predicted stroke probability.
# FIX: corrected the typo "Pobability" -> "Probability" in both printed messages.
df.iloc[0]
print("Probability of the stroke for the first observation equals : ", explainer.predict(X.iloc[[0]]))
df.iloc[1]
print("Probability of the stroke for the second observation equals : ", explainer.predict(X.iloc[[1]]))
Next, for the same observations, calculate the decomposition of predictions, so-called variable attributions, using SHAP from two packages of choice, e.g. for Python: dalex and shap, for R: DALEX and iml.
# SHAP attributions (dalex) for patients 0 and 1, drawn on one shared figure.
shap_attributions = []
for patient in range(2):
    shap_attributions.append(
        explainer.predict_parts(X.iloc[[patient]], type="shap", label=f'patient {patient}')
    )
shap_attributions[0].plot(shap_attributions[1:])
For the first patient, the probability of having a stroke is mostly affected by the heart_disease=1 and age=67 factors. The factors with the least impact on the final probability prediction are residence_type_urban=1 and having no hypertension. The final probability of having a stroke for that patient equals 70%.
For the second patient, the probability of having a stroke is most strongly affected by the age=80 factor. However, smoking_status = never smoked to some extent decreases the probability of having a stroke. The factors with the least impact on the final probability prediction are having no hypertension and residence_type_urban=1. The final probability of having a stroke for that patient equals 53%.
# Break-down attributions for the same two patients, on one shared figure.
bd_attributions = [
    explainer.predict_parts(X.iloc[[patient]], type="break_down", label=f'patient {patient}')
    for patient in range(2)
]
bd_attributions[0].plot(bd_attributions[1:])
Above we can see the break-down plots for observations 0 and 1.
Find any two observations in the dataset, such that they have different variables of the highest importance, e.g. age and gender have the highest (absolute) attribution for observation A, but race and class are more important for observation B.
I have found that observations 1 and 2 have different variables of highest importance — for observation 1 the two most important variables are age and smoking status, whereas for observation 2 they are avg_glucose_level and private work type.
# SHAP attributions for patients 1 and 2, which have different top variables.
shap_attributions12 = [
    explainer.predict_parts(X.iloc[[patient]], type="shap", label=f'patient {patient}')
    for patient in range(1, 3)
]
shap_attributions12[0].plot(shap_attributions12[1:])
(If possible) Select one variable X and find two observations in the dataset such that for one observation, X has a positive attribution, and for the other observation, X has a negative attribution.
Observations 2 and 12 have different attribution values for the BMI index: observation 2 has a negative attribution, whereas observation 12 has a positive attribution for the BMI index.
# Inspect the two rows, then compare their SHAP attributions (BMI flips sign).
df.iloc[2]
df.iloc[12]
selected_rows = [2, 12]
shap_attributions2_12 = [
    explainer.predict_parts(X.iloc[[patient]], type="shap", label=f'patient {patient}')
    for patient in selected_rows
]
shap_attributions2_12[0].plot(shap_attributions2_12[1:])
(How) Do the results differ across the two packages selected in point (3)?
Below are different plot types: waterfall plots, beeswarm plots, scatter plots, and force plots, which I find more informative than the break-down style of plots. We can infer much more information from them.
# One-hot encode the remaining multi-level categoricals so the `shap` package
# can work on a purely numeric matrix.
categorical_variables = ['work_type', 'smoking_status']
X_ohe = pd.get_dummies(
    X,
    columns=categorical_variables,
    drop_first=True,
)
X_ohe
# Second booster, fit on the fully one-hot-encoded feature matrix.
ohe_params = {
    "n_estimators": 200,
    "max_depth": 4,
    "use_label_encoder": False,
    "eval_metric": "logloss",
}
model_ohe = xgb.XGBClassifier(**ohe_params)
model_ohe.fit(X_ohe, y)
# shap TreeExplainer over the OHE model; `data=X_ohe` supplies the background
# data and `model_output="probability"` puts attributions on probability scale.
shap_explainer = shap.explainers.Tree(model_ohe, data=X_ohe, model_output="probability")
# Explanation object holding per-row, per-feature SHAP values for X_ohe.
shap_values = shap_explainer(X_ohe)
shap_values
# Per-patient waterfall plots, then a global beeswarm summary.
for patient in (0, 1):
    shap.plots.waterfall(shap_values[patient])
shap.plots.beeswarm(shap_values, max_display=10, plot_size=(9, 6))
# Global mean(|SHAP|) importance bar plot; `show=False` lets us resize the
# figure before rendering.
# FIX: dropped a redundant `import matplotlib.pyplot as plt` — pyplot is
# already imported as `plt` at the top of the notebook.
shap.plots.bar(shap_values, max_display=10, show=False)
plt.gcf().set_size_inches(9, 6)
plt.show()
# Dependence scatter for age alone, then age coloured by three other features
# to surface possible interactions.
shap.plots.scatter(shap_values[:, "age"], show=False)
plt.gcf().set_size_inches(9, 6)
plt.show()
for colour_feature in ("gender_Male", "bmi", "avg_glucose_level"):
    shap.plots.scatter(shap_values[:, "age"], color=shap_values[:, colour_feature], show=False)
    plt.gcf().set_size_inches(11, 6)
    plt.show()
from sklearn.svm import SVC

# A second model family for comparison: an SVM with probability estimates.
svm_ohe = SVC(probability=True)
svm_ohe.fit(X_ohe.values, y)

# A modest background sample keeps the SHAP-wrapper computation tractable.
X_subset = X_ohe.sample(111, random_state=0)
exp_svm = dx.Explainer(svm_ohe, X_subset, label="SVM", verbose=False)
exp_xgboost = dx.Explainer(model_ohe, X_subset, label="GBM", verbose=False)

# SHAP-wrapper attributions for patients 0 and 1 from each explainer.
sv_svm = []
for patient in range(2):
    sv_svm.append(exp_svm.predict_parts(X_ohe.iloc[[patient]], type="shap_wrapper"))
sv_xgboost = []
for patient in range(2):
    sv_xgboost.append(exp_xgboost.predict_parts(X_ohe.iloc[[patient]], type="shap_wrapper"))
# Side-by-side per-patient explanations from the two model families.
for i in range(2):
    print(f'==== Patient {i} ====')
    print('---- XGBoost:')
    sv_xgboost[i].plot()
    print('---- SVM:')
    sv_svm[i].plot()
Here we can see huge differences between the XGBoost and SVM models in the explanations created for the first two observations.
# Load shap's JavaScript so the interactive force plot renders in the notebook.
shap.initjs()
# Interactive stacked force plot over the first 500 rows.
shap.plots.force(shap_values[:500])